clear all
set more off
cap log close

***************************************************************************************************
* Program: 2_visits_data_prep.do
***************************************************************************************************

************************************************************************
** set global macros
************************************************************************
global root_dir "~/Dropbox/Prison_Covid/Visitation misconduct paper/Science Advances R&R/Replication"
global data_raw "${root_dir}/Data/Raw"
global gas_price_dir "${data_raw}/gas_price"
global compstat_dir "${data_raw}/cdcr_compstat"
global macroecon_data "${data_raw}/macro_economic_data"
global boundaries_dir "${data_raw}/boundaries"
global fire_dir "${data_raw}/california_historic_fire_perimeters_data"
global weather_dir "${data_raw}/weather"
global processed_dir "${root_dir}/Data/Processed"

************************************************************************
* Step 1: Count # of CA Residents
************************************************************************

/*
cap frame change default 
cap frame drop results 
frame create results
frame results{
	
	set obs 35 // 35 year-month
	
	* initializee regression coefficients and standard errors
	gen year = .
	gen month = .
	gen obs = .
	
}

local i = 1

forval yr = 2017/2020{

	forval mon = 1/12{

		capture confirm file "${data_raw}/phones/CA_Residents_`yr'_`mon'.dta"
		
		if _rc == 0 {
			
			display "`i'"
			
			use "${data_raw}/phones/CA_Residents_`yr'_`mon'.dta", clear
			gunique ID
			local obs = "`=r(unique)'"

			frame results{
				replace year = `yr' in `i'
				replace month = `mon' in `i'
				replace obs = `obs' in `i'
				local i = `i' + 1
			}
			
		}
	
	}
				
}	

frame change results

save "${processed_dir}/phones/CA_residents_count_2017_2020.dta", replace 
*/


************************************************************************
** STEP 2. Build monthly prisons visit data
************************************************************************

use "${processed_dir}/phones/Geo7HH_in_prison_2017_StatePrison_w_phone_id.dta", clear 
forval yr = 2018/2020{
	append using "${processed_dir}/phones/Geo7HH_in_prison_`yr'_StatePrison_w_phone_id.dta"
}
gen year = year(Stata_Date_local)
gen week = week(Stata_Date_local)

gunique Stata_Date_local, by(phone_id year) generate(total_days_year)
gunique Stata_Date_local if weekend == 0, by(phone_id year) generate(total_weekdays_year_)
gegen total_weekdays_year = max(total_weekdays_year_), by(phone_id year)
gunique Stata_Date_local if weekend == 1, by(phone_id year) generate(total_visit_days_year_)
gegen total_visit_days_year = max(total_visit_days_year_), by(phone_id year)

drop total_weekdays_year_ total_visit_days_year_

gunique phone_id if (Hour_local >= 0 & Hour_local <= 4), by(year week Shapefile_ID) generate(uniq_phones_night_)
gegen uniq_phones_night = max(uniq_phones_night_), by(year week Shapefile_ID)

// only keep visitation related
keep if weekend == 1 // & Hour_local >= 7 & Hour_local <= 15

merge m:1 gidbg using "${data_raw}/acs_bg_demographics_15_19.dta", keep(master match) nogenerate ///
	keepusing(pct_hispanic_acs_15_19 pct_nh_white_alone_acs_15_19 pct_nh_blk_alone_acs_15_19 pct_nh_asian_alone_acs_15_19 pct_college_acs_15_19 med_hhd_inc_bg_acs_15_19) 


gen byte visit_hour = Hour_local >= 7 & Hour_local <= 15
egen tag_id_prison_week = tag(year week Shapefile_ID phone_id)
egen tag_id_prison_week_vh = tag(year week Shapefile_ID phone_id visit_hour)


* Calculate average demograhpics for visitors 
gegen avg_income_1k_ = mean(med_hhd_inc_bg_acs_15_19) if (weekend==1) & (tag_id_prison_week == 1), by(Shapefile_ID week)
gegen avg_income_1k = max(avg_income_1k_), by(year week Shapefile_ID)

gegen avg_pct_hisp_ = mean(pct_hispanic_acs_15_19) if (weekend==1) & (tag_id_prison_week == 1), by(Shapefile_ID week)
gegen avg_pct_hisp = max(avg_pct_hisp_), by(Shapefile_ID week)

gegen avg_pct_white_ = mean(pct_nh_white_alone_acs_15_19) if (weekend==1) & (tag_id_prison_week == 1), by(Shapefile_ID week)
gegen avg_pct_white = max(avg_pct_white_),  by(Shapefile_ID week)

gegen avg_pct_black_ = mean(pct_nh_blk_alone_acs_15_19) if (weekend==1) & (tag_id_prison_week == 1), by(Shapefile_ID week)
gegen avg_pct_black = max(avg_pct_black_), by(Shapefile_ID week)

gegen avg_pct_asian_ = mean(pct_nh_asian_alone_acs_15_19) if (weekend==1) & (tag_id_prison_week == 1), by(Shapefile_ID week)
gegen avg_pct_asian = max(avg_pct_asian_), by(Shapefile_ID week)

gegen avg_pct_college_ = mean(pct_college_acs_15_19) if (weekend==1) & (tag_id_prison_week == 1), by(Shapefile_ID week)
gegen avg_pct_college = max(avg_pct_college_), by(Shapefile_ID week)
	
* visit hour only 
gegen avg_income_1k_vh_ = mean(med_hhd_inc_bg_acs_15_19) if (weekend==1) & (tag_id_prison_week_vh == 1), by(Shapefile_ID week)
gegen avg_income_1k_vh = max(avg_income_1k_vh_), by(year week Shapefile_ID)

gegen avg_pct_hisp_vh_ = mean(pct_hispanic_acs_15_19) if (weekend==1) & (tag_id_prison_week_vh == 1), by(Shapefile_ID week)
gegen avg_pct_hisp_vh = max(avg_pct_hisp_vh_), by(Shapefile_ID week)

gegen avg_pct_white_vh_ = mean(pct_nh_white_alone_acs_15_19) if (weekend==1) & (tag_id_prison_week_vh == 1), by(Shapefile_ID week)
gegen avg_pct_white_vh = max(avg_pct_white_vh_),  by(Shapefile_ID week)

gegen avg_pct_black_vh_ = mean(pct_nh_blk_alone_acs_15_19) if (weekend==1) & (tag_id_prison_week_vh == 1), by(Shapefile_ID week)
gegen avg_pct_black_vh = max(avg_pct_black_vh_), by(Shapefile_ID week)

gegen avg_pct_asian_vh_ = mean(pct_nh_asian_alone_acs_15_19) if (weekend==1) & (tag_id_prison_week_vh == 1), by(Shapefile_ID week)
gegen avg_pct_asian_vh = max(avg_pct_asian_vh_), by(Shapefile_ID week)

gegen avg_pct_college_vh_ = mean(pct_college_acs_15_19) if (weekend==1) & (tag_id_prison_week_vh == 1), by(Shapefile_ID week)
gegen avg_pct_college_vh = max(avg_pct_college_vh_), by(Shapefile_ID week)

// get first vs. other visitors 
bysort Shapefile_ID phone_id (Stata_Date_local): gen first_visit_day = Stata_Date_local[1]
format first_visit_day %td

gen is_new_visit = (Stata_Date_local == first_visit_day)
gen is_repeat_visit = (Stata_Date_local > first_visit_day)
gegen new_visit_week = max(is_new_visit), by(phone_id Shapefile_ID week year)
gegen time_in_prisons_7to15 = total(visit_hour * weekend), by(Shapefile_ID phone_id Stata_Date_local)

* calculate sum of unique visitors to each prison (during fri_sat_sun) each weekend
gunique phone_id if weekend, by(year week Shapefile_ID) generate(uniq_phones_)
gegen uniq_phones = max(uniq_phones_), by(year week Shapefile_ID)

gunique phone_id if weekend & visit_hour, by(year week Shapefile_ID) generate(uniq_phones_7to15_)
gegen uniq_phones_7to15 = max(uniq_phones_7to15_), by(year week Shapefile_ID)	

gunique phone_id if weekend & visit_hour & (time_in_prisons_7to15 > 1) , by(year week Shapefile_ID) generate(uniq_phones_7to15_1hr_)
gegen uniq_phones_7to15_1hr = max(uniq_phones_7to15_1hr_), by(year week Shapefile_ID)

gunique phone_id if weekend & visit_hour & new_visit_week, by(year week Shapefile_ID) generate(uniq_phones_new_)
gegen uniq_phones_new = max(uniq_phones_new_), by(year week Shapefile_ID)

gunique phone_id if weekend & visit_hour & new_visit_week == 0, by(year week Shapefile_ID) generate(uniq_phones_repeat_)
gegen uniq_phones_repeat = max(uniq_phones_repeat_), by(year week Shapefile_ID)

drop avg_*_ uniq*_

* collapse to prison_week 
gsort year week Shapefile_ID Stata_Date_local
drop if year == year[_n-1] & week == week[_n-1] &  Shapefile_ID == Shapefile_ID[_n-1]

keep year week month Shapefile_ID uniq* avg* NAME prison_name 

// if any of the variables are missing, means that they should be zero 
local vars "uniq_phones uniq_phones_7to15 uniq_phones_7to15_1hr uniq_phones_night uniq_phones_new uniq_phones_repeat" 
foreach var of local vars{
	replace `var' = 0 if missing(`var')
}

replace month = 3 if week == 13 & month == 4 & year == 2018 
replace month = 6 if week == 26 & month == 7  & year == 2018 // make July 1 -> week 26 to be month 6 
 
 
// assert each week is corresponding to only one month 
gunique month, by(year week) generate(num_month)
assert num_month <= 1 if month!=.

gegen max_month = max(month), by(year week)
drop month 
rename max_month month

save "${processed_dir}/phones/weekly_state_prisons_visits_2017_2020.dta", replace 

use "${processed_dir}/phones/weekly_state_prisons_visits_2017_2020.dta",  clear
//rename tot_HHs_FriSatSun HHs_FriSatSun
rename uniq_phones visits
rename uniq_phones_* visits_*

// aggregate prisonXweek level to prison-month level 
local vars "visits visits_7to15 visits_7to15_1hr visits_night visits_new visits_repeat"
foreach var of local vars{
	gegen tot_`var' = total(`var'), by(NAME year month)
}

sort NAME year month 
drop if NAME == NAME[_n-1] & year == year[_n-1] & month == month[_n-1]

keep NAME year month tot_* 

drop if month == 1 & year == 2017
drop if month == 12 & year == 2017
drop if month <= 2 & year == 2018
drop if month > 3 & year == 2020
sort year month

merge m:1 year month using "${processed_dir}/phones/CA_residents_count_2017_2020.dta", nogenerate

gen norm_visits = tot_visits / obs * 1000000
gen norm_visits_7to15 = tot_visits_7to15 / obs * 1000000
gen norm_visits_7to15_1hr = tot_visits_7to15_1hr / obs * 1000000
gen norm_visits_night = tot_visits_night / obs * 1000000
gen norm_visits_new = tot_visits_new / obs * 1000000
gen norm_visits_repeat = tot_visits_repeat / obs * 1000000

label variable norm_visits "Normalized Visits"
label variable norm_visits_7to15 "Normalized Visits (During 7am-3pm)"
label variable norm_visits_7to15_1hr  "Normalized Visits (During 7am-3pm, time in prison >= 1h)"

save "${processed_dir}/phones/monthly_prisons_visit_multi_year.dta", replace 


****************************
// obtain wavg_dist_zcta_to_prison
****************************
use "${data_raw}/num_prisoners_prison_zcta_exploded", clear // based on CDCR roster data

// get geometric center of ZCTA 
rename zcta GEOID10 
merge m:1 GEOID10 using "${data_raw}/tl_2019_us_zcta510_data.dta", ///
	keepusing(INTPTLAT10 INTPTLON10) keep(master match) nogenerate 
rename GEOID10 zcta 

destring INTPTLAT10 INTPTLON10, replace 
rename INTPTLAT10 zcta_lat 
rename INTPTLON10 zcta_lon

// merge to get prison name (formatted differently), FID (to merge later) and address
merge m:1 prison_acronym using "${data_raw}/CA_station_prison_address.dta", ///
	keepusing(FID NAME)

drop if _merge == 2 
drop _merge 	

// merge to get prison's geocoded lat-long
merge m:1 FID using "${data_raw}/CA_station_prison_address_google_geocoded.dta", ///
	keepusing(lat_google lon_google loc_type) keep(master match) nogenerate 
	
// calculate the straight line distance between prison and zcta center 
geodist lat_google lon_google zcta_lat zcta_lon, generate(dist_prison_zcta)

// weighted average distance from ZCTA to distance 
// weight = the estimated num_prisoners_zcta
gegen wavg_dist_ = mean(dist_prison_zcta) [w = num_prisoners_zcta], by(NAME)
gegen wavg_dist_zcta_to_prison = max(wavg_dist_), by(NAME)

sort NAME 
drop if NAME == NAME[_n-1]

keep NAME prison_name prison_name_full prison_acronym FID wavg_dist_zcta_to_prison

egen dist_rank = rank(wavg_dist_zcta_to_prison)
gquantiles dist_quartile = wavg_dist_zcta_to_prison, nquantiles(4) xtile

save "${processed_dir}/wavg_dist_zcta_to_prison.dta", replace


************************************************************************
* compile all years data 
************************************************************************

use "${processed_dir}/phones/Geo7HH_in_prison_2017_StatePrison_w_phone_id.dta", clear 
forval yr = 2018/2020{
	append using "${processed_dir}/phones/Geo7HH_in_prison_`yr'_StatePrison_w_phone_id.dta"
}

// only keep visitation related
keep if weekend == 1 // & Hour_local >= 7 & Hour_local <= 15

// visit-day-device data
sort phone_id Stata_Date_local prison_name
drop if phone_id == phone_id[_n-1] & Stata_Date_local==Stata_Date_local[_n-1] & prison_name==prison_name[_n-1]

keep phone_id gidbg prison_name Stata_Date_local geohash_7 Lat Long month

gunique phone_id, by(gidbg) generate(num_phone)
drop if num_phone == 1 // drop block group with only 1 phone 

save  "${processed_dir}/phones/visits_phone_noised.dta", replace 
